# -*- coding:UTF-8 -*- 
"""
@Project:   DataCrawler
@FileName:  html_parser.py 
@CreateDate:2023/4/22 23:28  
@Author:    Jia  
@Desc:      URL解析器
-------------------------------------------------------
数据结构：
redis 集合              redis 哈希
-------------------------------------------------------
is_crawled  已爬        first_floor_data  compy:url
no_crawled  未爬
new_urls    新URL
fail_urls   失败URL
--------------------------------------------------------
"""
import asyncio
import logging
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By


class HtmlParser:
    """Parse crawled company pages and store the extracted values in redis.

    Redis keys written by this class:
        first_floor_urls   (set)   href of every company link in a list page
        first_floor_data   (hash)  company link text -> href
        second_floor_urls  (hash)  partner link text -> href (selenium)
        second_fail_url    (set)   URLs whose company heading was not found
        third_floor_data   (hash)  contact name -> phone
    """

    # Selectors are matched verbatim against the target site's markup.
    _COMPANY_LINK_CLASS = 'index_alink__zcia5 link-click'
    _CONTACT_LINK_CLASS = 'index_link-click__NmHxP'
    _CONTACT_TEL_CLASS = 'index_detail-tel__fgpsE'
    _COMPANY_NAME_XPATH = '//h1[@class="index_company-name__LqKlo"]'
    _PARTNER_LINK_XPATH = '//a[@class="index_lazy-img-toco__EU_FE link-click"]'
    # Seconds to wait after navigation so the page can finish rendering.
    _PAGE_LOAD_WAIT = 4

    def __init__(self, rds, logger=None):
        """Store the redis client (the original note says database 1 is used).

        Args:
            rds: redis client; only ``sadd`` and ``hset`` are called on it.
            logger: optional logger used to report scraping problems.
        """
        self.r = rds
        self.logger = logger

    def _resolve_logger(self):
        """Return the logger to report problems with.

        Preference order: the logger passed to ``__init__``, a module-level
        ``logger`` (the ``__main__`` debug block of this file defines one),
        then the stdlib logger of this module. The fallback keeps the class
        usable when it is imported and no global ``logger`` exists.
        """
        if self.logger is not None:
            return self.logger
        return globals().get('logger') or logging.getLogger(__name__)

    def _company_links(self, soup):
        """Yield ``(text, href)`` for each company anchor that has an href."""
        for tag in soup.find_all('a', attrs={'class': self._COMPANY_LINK_CLASS}):
            href = tag.attrs.get('href')
            if href is None:
                # An anchor without href carries no URL to crawl or store.
                continue
            yield tag.text, href

    def _get_new_urls(self, soup):
        """Add every company link found in *soup* to the ``first_floor_urls`` set."""
        for _, href in self._company_links(soup):
            self.r.sadd('first_floor_urls', href)

    def _get_new_data(self, soup):
        """Store ``company text -> href`` from *soup* in the ``first_floor_data`` hash."""
        for company, href in self._company_links(soup):
            self.r.hset('first_floor_data', company, href)

    async def parser(self, html_text):
        """Parse a list page and store its company links and data in redis.

        Args:
            html_text: HTML markup, or ``None`` when there is nothing to parse.

        Returns:
            Always ``None``; results are written to redis.
        """
        if html_text is None:
            return None

        soup = BeautifulSoup(html_text, 'html.parser')
        self._get_new_urls(soup)
        self._get_new_data(soup)
        return None

    async def parser_second_floor(self, html_text):
        """Extract the contact name and phone and store them in ``third_floor_data``.

        Nothing is stored (and a debug message is logged) when the name anchor,
        its ``title`` attribute, or the phone element is missing.

        Args:
            html_text: HTML markup, or ``None`` when there is nothing to parse.

        Returns:
            Always ``None``; results are written to redis.
        """
        if html_text is None:
            return None

        soup = BeautifulSoup(html_text, 'html.parser')
        contact = soup.find('a', attrs={'class': self._CONTACT_LINK_CLASS})
        tel = soup.find('span', attrs={'class': self._CONTACT_TEL_CLASS})

        name = contact.get('title') if contact is not None else None
        if name is None or tel is None:
            self._resolve_logger().debug(
                'parser_second_floor: contact name or phone element not found')
            return None

        self.r.hset('third_floor_data', name, tel.text)
        return None

    @staticmethod
    def _chrome_options():
        """Build the Chrome options used for scraping."""
        option = webdriver.ChromeOptions()

        # Hide the "controlled by automated software" banner / developer prompt.
        option.add_experimental_option('useAutomationExtension', False)
        option.add_experimental_option("excludeSwitches", ['enable-automation'])

        # Do not close the browser automatically when the script ends.
        option.add_experimental_option('detach', True)
        return option

    async def open_web(self, url):
        """Load *url* in Chrome and store the partner links found on the page.

        The page source alone does not contain the full data, so selenium is
        used. Partner links go to the ``second_floor_urls`` hash; when the
        company-name heading is absent the URL is added to ``second_fail_url``.
        Any other error is logged and swallowed (best effort).

        NOTE: the selenium calls themselves are blocking; only the page-load
        wait yields to the event loop.

        Args:
            url: address of the company page to open.
        """
        driver = webdriver.Chrome(options=self._chrome_options())
        try:
            # Navigation sits inside the try so the browser is always released.
            driver.get(url)
            driver.maximize_window()

            # Give the page time to render without blocking other coroutines.
            await asyncio.sleep(self._PAGE_LOAD_WAIT)

            # find_elements returns an empty list instead of raising, which
            # makes the "page has no company heading" case detectable.
            if not driver.find_elements(By.XPATH, self._COMPANY_NAME_XPATH):
                self.r.sadd('second_fail_url', url)
                return

            # Scroll so the lazily loaded partner section is rendered
            # (left offset, top offset, smooth scrolling).
            js = 'window.scrollTo({left:0, top:2500, behavior: "smooth"});'
            driver.execute_script(js)

            for anchor in driver.find_elements(By.XPATH, self._PARTNER_LINK_XPATH):
                link = anchor.get_property('href')
                name = anchor.get_property('text')
                if link is None or name is None:
                    # Skip incomplete anchors instead of aborting the whole page.
                    continue
                self.r.hset('second_floor_urls', name, link)
        except Exception as error:
            self._resolve_logger().debug(f'错误信息:{error}')
        finally:
            # quit() ends the whole session, including the chromedriver
            # process; close() would only close the current window.
            driver.quit()


if __name__ == '__main__':
    # Manual debug run: parse a saved page and scrape one live company page.
    from database import redis_db
    from Config import log

    # Kept as a module-level name on purpose: HtmlParser.open_web reports
    # errors through the module-global `logger`.
    logger = log.Logs().debug_logger()
    # Per the original note, debugging runs against redis db0.
    r = redis_db.redis_conn(logger)
    h = HtmlParser(r)

    # Read the fixture eagerly so the file handle is closed before parsing.
    with open('../Data/res_text2.html', encoding='utf-8') as html_file:
        html_content = html_file.read()

    # open_web requires a URL; this is the sample company page noted in open_web.
    debug_url = 'https://www.tianyancha.com/company/5233340769'

    async def _debug_run():
        """Run both debug tasks concurrently on one event loop."""
        await asyncio.gather(
            h.parser_second_floor(html_content),
            h.open_web(debug_url),
        )

    asyncio.run(_debug_run())

